import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import plotly.express as px
from collections import defaultdict
import json
from collections import Counter
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
from IPython.core.display import display, HTML
#%matplotlib inline
## Access to all the data -- renders a clickable link to the GitHub repo
## that hosts the raw data files used below
display(HTML("""<a href="https://github.com/tingtingbun/interactiveds/tree/master/interactive">datafile</a>"""))
### dataset loading
def data_loading(name_file, state_file, us_abbre_file):
    """Load and merge the baby-name files, income spreadsheet and state codes.

    name_file     -- directory containing one CSV of name records per state
    state_file    -- Excel file of median income, one column per year ('State' first)
    us_abbre_file -- JSON file mapping state abbreviation -> full state name

    Returns a DataFrame of name records joined with median income on
    (State, year) == (State, Year_of_birth).
    """
    ### Fix: list the directory passed as `name_file` (was hard-coded to
    ### 'namesbystate', which broke the function for any other directory)
    df = pd.concat(
        [pd.read_csv(os.path.join(name_file, fname), sep=",", header=None)
         for fname in os.listdir(name_file) if not fname.startswith('.')],
        ignore_index=True)
    df.columns = ['State', 'Sex', 'Year_of_birth', 'Name', 'Num_of_occurrences']
    print('There are {} states in the dataset and they are {}'.format(len(df.State.unique().tolist()), df.State.unique().tolist()))
    ### To check all 50 + D.C. are loaded correctly
    ### loading income dataset: melt wide (one column per year) into long form
    df_income = pd.read_excel(state_file, header=[0])
    df_income = pd.melt(df_income, id_vars='State',
                        value_vars=df_income.columns.tolist()[1:])
    df_income.rename(columns={"variable": "year", "value": "median_income"},
                     inplace=True)
    ### loading state abbrev json, inverted to map full name -> abbreviation
    with open(us_abbre_file) as file:
        us_code = {y: x for x, y in json.load(file).items()}
    ### replacing full state names with abbreviations; 'United States' has no
    ### abbreviation so it is passed through unchanged (vectorized map
    ### replaces the original row-by-row iterrows loop)
    df_income['State'] = df_income['State'].map(
        lambda s: s if s == 'United States' else us_code[s])
    return pd.merge(df_income, df, left_on=['State', 'year'],
                    right_on=['State', 'Year_of_birth'])
### Build the integrated dataset and show a quick sanity preview of it
a = data_loading('namesbystate', 'h08-2.xls', 'state_code.json')
head_preview = a.head().to_string()
print('The head 5 rows of the data \n\n' + head_preview + '\n\n')
field_summary = a.describe().to_string()
print('Basic description of different fields in the integrated dataset \n\n' + field_summary)
def gender_neutral_visualize(df):
    """Visualize gender-neutral names.

    Left panel: bar chart of names whose occurrences are split exactly 50/50
    between the sexes, annotating the most common one. Right panel: scatter of
    the normalized F/M difference against total occurrences, showing the
    trade-off between gender balance and popularity.
    """
    gender_df = df.groupby(['Name', 'Sex']).agg({'Num_of_occurrences': 'sum'}).unstack().fillna(0)
    ### keep only names used by both sexes; Fix: .copy() so the derived
    ### columns below are added to a real frame, not a boolean-indexed slice
    ### (avoids pandas SettingWithCopyWarning / silently dropped writes)
    bi_gender = gender_df.Num_of_occurrences[(gender_df.Num_of_occurrences.F > 0) & (gender_df.Num_of_occurrences.M > 0)].copy()
    ### Derived fields: absolute and normalized F/M difference, total count
    bi_gender['diff'] = abs(bi_gender.F - bi_gender.M)
    bi_gender['percent_dff'] = abs(bi_gender.F - bi_gender.M) / (bi_gender.M + bi_gender.F)
    bi_gender['Total_num'] = bi_gender.M + bi_gender.F
    ### names shared exactly equally between men and women
    absolute_equal = bi_gender[bi_gender['percent_dff'] == 0].copy()
    absolute_equal.reset_index(inplace=True)
    ### Plotting equally shared names against their total occurrences
    f, (ax1, ax2) = plt.subplots(1, 2, figsize=(25, 10))
    x = absolute_equal.Name.tolist()
    y_pos = np.arange(len(x))
    y = absolute_equal['Total_num'].tolist()
    ax1.bar(y_pos, y, align='center', alpha=0.6)
    ax1.set_xticks(y_pos)
    ax1.set_xticklabels(x, rotation=90)
    ax1.set_ylabel('Num of Occurrences')
    ax1.set_title('Absolute Gender Neutral Names')
    ### annotate the most frequent perfectly balanced name with an arrow
    max_state = absolute_equal[absolute_equal.Total_num == absolute_equal.Total_num.max()].Name.values[0]
    print(max_state)
    arrow_x = absolute_equal[absolute_equal.Total_num == absolute_equal.Total_num.max()].index.values[0] + 0.5
    arrow_y = absolute_equal.Total_num.max() - 1
    ax1.annotate('max name is {} and the total count is {}'.format(max_state, absolute_equal.Total_num.max()),
                 xy=(arrow_x, arrow_y), xytext=(arrow_x - 20, arrow_y),
                 arrowprops=dict(facecolor="black", width=4, headwidth=10, shrink=0.2))
    ### Plotting percent difference against total number of occurrences
    ax2.scatter(x=bi_gender['percent_dff'], y=bi_gender['Total_num'])
    ax2.set_xlabel('Normalized Difference')
    ax2.set_ylabel('Num of occurrences')
    ax2.set_title('occurrences and Percent_diff Trade-off')
gender_neutral_visualize(a)
def algorithm_calculation(df, location='All'):
    """Rank gender-neutral names by a weighted balance/popularity score.

    df       -- name records with columns Name, Sex, Num_of_occurrences
    location -- label used in the printed summary (default 'All')

    Prints the top-10 scored rows and the top-100 suggested names; returns None.
    """
    gender_df = df.groupby(['Name', 'Sex']).agg({'Num_of_occurrences': 'sum'}).unstack().fillna(0)
    counts = gender_df.Num_of_occurrences
    ### keep names used by both sexes; Fix: .copy() before adding the derived
    ### columns (avoids SettingWithCopyWarning on the boolean-indexed slice)
    bi_gender = counts[(counts.F > 0) & (counts.M > 0)].copy()
    bi_gender['percent_dff'] = abs(bi_gender.F - bi_gender.M) / (bi_gender.M + bi_gender.F)
    bi_gender['Total_num'] = bi_gender.M + bi_gender.F  ### replicating the aggregated and derived dataframe
    ### candidates: reasonably balanced names, excluding the 'Unknown' placeholder
    ranked = bi_gender[(bi_gender['percent_dff'] <= 0.5) & (bi_gender.index != 'Unknown')].copy()
    ranked['percent_diff_rank'] = ranked['percent_dff'].rank(ascending=False)  ### lower percent_dff -> higher rank score
    ranked['Total_num_rank'] = ranked['Total_num'].rank()  ### higher total count -> higher rank score
    ranked['score'] = ranked['percent_diff_rank'] + 0.75 * ranked['Total_num_rank']  ### combined score via the 0.75 weighting coefficient
    ranked.sort_values(by='score', ascending=False, inplace=True)
    print(ranked.head(10).to_string())
    top_names = ranked.head(100).index.tolist()
    ### Fix: 'nuetral' -> 'neutral' in the reported message
    print('\n the top {} suggested highly gender neutral names in {} are \n {}'.format(len(top_names), location, top_names))
algorithm_calculation(a) ### shows the suggested name list for all states combined
### Uncomment below to print the suggested name list for each state separately
#for i in a.State.unique().tolist():
#print('\n')
#print(i)
#algorithm_calculation(a[a.State == i],location = i)
def regression_plot_income_name(df, year=False):
    """Plot median income against the count of name records with a fitted line.

    df   -- merged dataset with Name, State, year and median_income columns
    year -- False (default) aggregates per state only; pass a column name
            (e.g. 'year') to aggregate per (State, <year>) pair instead

    Returns the seaborn FacetGrid produced by lmplot.
    """
    ### Fix: identity test instead of `== False` equality comparison; only the
    ### literal default disables per-year grouping
    element = ['State'] if year is False else ['State', year]
    agg_df = df.groupby(element).agg({'Name': 'count', 'median_income': 'mean'})
    agg_df.rename(columns={'Name': 'count of name'}, inplace=True)
    agg_df['state'] = agg_df.index
    return sns.lmplot(y='median_income', x='count of name', data=agg_df)
### Regression plots of median income vs. per-state name-record counts
regression_plot_income_name(a)           # aggregated per state
regression_plot_income_name(a, 'year')   # aggregated per (state, year)
### Formal OLS fit of the same relationship
agg_df = a.groupby(['State', 'year']).agg({'Name':'count','median_income':'mean'})
agg_df.rename(columns = {'Name':'count of name'}, inplace = True)
agg_df['state'] = agg_df.index
y = agg_df.median_income.values.tolist()
x = agg_df['count of name']
### NOTE(review): endog is the name count and exog the income, with no
### constant term added (sm.add_constant) -- a regression through the origin;
### confirm this is intended
model = sm.OLS(x,y)
results = model.fit()
slope = results.params[0]
### one-sided p-value from the t statistic's survival function
p_value = stats.t.sf(results.tvalues[0], results.df_resid)
### Fix: 'uique' -> 'unique' in the reported message
print('The regression of total count of unique names on state median income \n t value : {} and p value of {}'.format(results.tvalues[0],p_value))
### Approach 1 a. time-series plot of the number of unique name records per state
def name_volumn_visualize(df):
    """Interactive time series: one plotly line per state showing how many
    name records it reports for each year of birth."""
    counts = df.groupby(['Year_of_birth', 'State']).agg({'Name': 'count'}).reset_index()
    ### expose the plotting columns under the axis/legend names used below
    counts['year'] = counts['Year_of_birth']
    counts['volumn'] = counts['Name']
    counts['state'] = counts['State']
    fig = px.line(counts, x='year', y='volumn', color='state')
    fig.show(renderer="notebook")
name_volumn_visualize(a)  ### You can hover over any line to see what state it corresponds to
### Approach 1 b. bar chart of how many names each state was the first to adopt
def first_setter(df):
    """Bar chart of first-adopter counts per state.

    For every name, find its earliest Year_of_birth; the states that report
    the name in that first year get credit for 'setting' it.
    """
    new = df.groupby('Name').agg({'Year_of_birth': 'min'})
    new['Name_min'] = new.index
    ### Fix: join against the `df` argument (was the module-level global `a`,
    ### which made the parameter meaningless)
    new_df = pd.merge(left=new, right=df, how='inner',
                      left_on=['Name_min', 'Year_of_birth'],
                      right_on=['Name', 'Year_of_birth'])
    ### plotting: count of first-adopted names per state
    new_df = new_df.groupby('State').agg({'Name_min': 'count'})
    new_df['State'] = new_df.index
    fig = px.bar(x=new_df['State'], y=new_df['Name_min'])
    ### Fix: the keyword is `renderer` (as used earlier in this file);
    ### `render=` is not accepted by fig.show and raises a TypeError
    fig.show(renderer='notebook')
first_setter(a)
### Rationale for choosing 500: coverage of the total occurrences by the top-k names
### Fix: the nationwide ranking does not depend on the cutoff, so compute it
### once instead of re-running groupby/sort on every loop iteration
name_df = a.groupby('Name').agg({'Num_of_occurrences': 'sum'})
name_df.sort_values(by='Num_of_occurrences', ascending=False, inplace=True)
total = name_df.Num_of_occurrences.sum()
for i in [100, 200, 500, 800, 1000, 2000]:
    capture = name_df[0:i].Num_of_occurrences.sum() / total * 100
    print('the top {} names capture {}% percent of the total names'.format(i, capture))
### Pie chart: share of all occurrences covered by the chosen top-500 names
digit1 = name_df[0:500].Num_of_occurrences.sum()
fig1, ax1 = plt.subplots()
ax1.pie([digit1, total - digit1], labels=['500 Names', 'Remaining Names'],
        autopct='%1.1f%%', shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
### Approach 2: a. list of top 500 names
###             b. the percentage of babies who used these 500 names in each state
def popular_converage(df):
    """Per-state bar chart of the share of records covered by the nationwide
    top-500 names, annotating the state with the largest share."""
    name_df = df.groupby('Name').agg({'Num_of_occurrences': 'sum'})  ### identify top 500 names
    name_df.sort_values(by='Num_of_occurrences', ascending=False, inplace=True)
    top_500 = name_df[0:500].index.tolist()
    df_top_500 = df[df.Name.isin(top_500)]
    count_df = df_top_500.groupby('State').agg({'Name': 'count'})
    ### Fix: compute the per-state totals from the `df` argument
    ### (was the module-level global `a`)
    total_occur = df.groupby('State').agg({'Num_of_occurrences': 'sum'})
    count_df = pd.merge(count_df, total_occur, left_index=True, right_index=True)
    ### NOTE(review): the numerator is a row count while the denominator is a
    ### sum of occurrences; for a true "percentage of babies" the numerator
    ### should probably be df_top_500's Num_of_occurrences sum -- confirm intent
    count_df['Name_percent'] = count_df['Name'] / count_df['Num_of_occurrences']
    count_df.reset_index(inplace=True)
    ### plotting the bar chart of the top-500 share per state
    plt.figure(figsize=(15, 5))
    x = count_df.State.tolist()
    y_pos = np.arange(len(x))
    y = count_df['Name_percent'].tolist()
    plt.bar(y_pos, y, align='center', alpha=0.6)
    plt.xticks(y_pos, x, rotation=90)
    plt.ylabel('percentage of total occurrences')
    plt.title('Percentage of Babies with the Top 500 Names across States', fontsize=20)
    ### annotate the state with the maximum share
    max_state = count_df[count_df['Name_percent'] == count_df['Name_percent'].max()].State.values[0]
    arrow_x = count_df[count_df['Name_percent'] == count_df['Name_percent'].max()].index.values[0] + 0.5
    arrow_y = count_df['Name_percent'].max()
    plt.annotate('max state is {} and the total count is {}'.format(max_state, arrow_y),
                 xy=(arrow_x, arrow_y), xytext=(arrow_x + 3, arrow_y),
                 arrowprops=dict(facecolor="black", width=3, headwidth=10, shrink=0.2))
popular_converage(a)